{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>tag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>18</td>\n",
       "      <td>4141</td>\n",
       "      <td>Mark Waters</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>65</td>\n",
       "      <td>208</td>\n",
       "      <td>dark hero</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>65</td>\n",
       "      <td>353</td>\n",
       "      <td>dark hero</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>65</td>\n",
       "      <td>521</td>\n",
       "      <td>noir thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>65</td>\n",
       "      <td>592</td>\n",
       "      <td>dark hero</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId            tag\n",
       "0      18     4141    Mark Waters\n",
       "1      65      208      dark hero\n",
       "2      65      353      dark hero\n",
       "3      65      521  noir thriller\n",
       "4      65      592      dark hero"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tag = pd.read_csv('./data/tag.csv',usecols=[0,1,2])\n",
    "df_tag.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 465564 entries, 0 to 465563\n",
      "Data columns (total 3 columns):\n",
      " #   Column   Non-Null Count   Dtype \n",
      "---  ------   --------------   ----- \n",
      " 0   userId   465564 non-null  int64 \n",
      " 1   movieId  465564 non-null  int64 \n",
      " 2   tag      465548 non-null  object\n",
      "dtypes: int64(2), object(1)\n",
      "memory usage: 10.7+ MB\n"
     ]
    }
   ],
   "source": [
    "df_tag.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 删除出现次数小于20次标签，以及对应数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['sci-fi', 'based on a book', 'atmospheric', 'comedy', 'action', 'surreal', 'BD-R', 'twist ending', 'funny', 'dystopia', 'stylized', 'quirky', 'dark comedy', 'classic', 'psychology', 'fantasy', 'time travel', 'romance', 'visually appealing', 'disturbing', 'aliens', 'thought-provoking', 'social commentary', 'Nudity (Topless)', 'violence', 'drugs', 'Criterion', 'true story', 'nudity (topless)', 'adventure', 'animation', 'imdb top 250', 'space', 'CLV', 'dark', 'superhero', 'black comedy', 'post-apocalyptic', 'World War II', 'Betamax', 'cult film', 'satire', 'tense', 'thriller', 'drama', 'predictable', 'politics', 'adapted from:book', 'bittersweet', 'horror', 'based on a true story', 'revenge', 'boring', 'cinematography', 'serial killer', 'great soundtrack', 'music', 'coming of age', 'high school', 'comic book', 'religion', 'violent', 'zombies', 'DVD-Video', 'mental illness', 'franchise', 'anime', 'dreamlike', 'alternate reality', 'friendship', 'murder', 'crime', 'war', 'Quentin Tarantino', 'nonlinear', 'psychological', 'Oscar (Best Picture)', 'parody', 'suspense', 'less than 300 ratings', 'black and white', 'R', \"erlend's DVDs\", 'Johnny Depp', 'remake', 'magic', 'martial arts', 'Disney', 'multiple storylines', 'Brad Pitt', 'Bruce Willis', 'witty', \"Tumey's DVDs\", 'humorous', 'seen more than once', 'inspirational', 'stupid', 'mystery', 'family', 'robots', 'police', 'hilarious', 'slow', 'Tom Hanks', 'history', 'DVD-RAM', 'organized crime', 'great acting', 'musical', 'Morgan Freeman', 'Japan', 'sequel', 'Pixar', 'ensemble cast', 'vampires', 'New York City', 'overrated', 'mindfuck', 'documentary', 'espionage', 'Nudity (Full Frontal)', 'Nudity (Topless - Brief)', 'future', 'artificial intelligence', 'Nudity (Topless - Notable)', 'philosophy', '70mm', 'corruption', 'historical', 'Steven Spielberg', 'gay', 'beautiful', 'Edward Norton', 'satirical', 'movie to see', 'emotional', 'sexuality', 'Bill Murray', 'assassin', 'dialogue', 'Christian Bale', 'soundtrack', 
'father-son relationship', 'fairy tale', 'original', 'Leonardo DiCaprio', 'sports', 'relationships', 'England', 'prison', 'lesbian', 'racism', 'Clint Eastwood', 'road trip', 'military', 'Tim Burton', 'acting', 'British', 'loneliness', 'rape', 'chick flick', 'story', 'Samuel L. Jackson', 'weird', 'Nudity (Full Frontal - Notable)', 'Jim Carrey', 'bad acting', 'silly', 'Tom Cruise', 'BD-Video', 'depressing', 'Matt Damon', 'Action', 'love', 'gore', 'Robert Downey Jr.', 'Stanley Kubrick', 'heist', 'Natalie Portman', 'whimsical', 'Robert De Niro', 'Harrison Ford', 'ghosts', 'musicians', 'small town', 'conspiracy', 'Christmas', 'biography', 'great ending', 'Studio Ghibli', 'creepy', 'Robin Williams', 'talking animals', 'philosophical', 'touching', 'might like', 'teen', 'cyberpunk', 'gothic', 'Martin Scorsese', 'Bechdel Test:Fail', 'Arnold Schwarzenegger', 'nudity (full frontal)', 'Coen Brothers', 'Jack Nicholson', 'reviewed', 'suicide', 'Nicolas Cage', 'Al Pacino', 'To See', 'campy', 'survival', 'library', 'Scarlett Johansson', 'clever', 'narrated', 'Kevin Spacey', 'reflective', 'National Film Registry', 'science fiction', 'Christianity', 'long', 'Romance', 'complicated', 'controversial', 'easily confused with other movie(s) (title)', 'George Clooney', 'based on a play', 'adultery', 'Nazis', 'directorial debut', 'heartwarming', 'surrealism', 'noir thriller', 'apocalypse', 'gritty', 'futuristic', 'rock and roll', 'Keanu Reeves', 'memory', 'Alfred Hitchcock', 'virtual reality', 'Stephen King', 'Woody Allen', 'queer', 'Oscar (Best Cinematography)', 'DVD-R', 'supernatural', 'netflix', 'dark humor', 'children', 'love story', 'hallucinatory', 'perrot library', 'visually stunning', 'Paris', 'ClearPlay', 'low budget', 'Comedy', 'death', 'France', 'In Netflix queue', 'Bechdel Test:Pass', \"Can't remember\", 'dvd', 'Bibliothek', 'Netflix Finland', 'terrorism', 'too long', 'Will Smith', 'journalism', '1980s', 'cliche', 'short', 'western', \"memasa's movies\", 'Oscar (Best Actor)', 
'sad', 'on dvr', 'Philip Seymour Hoffman', 'intense', 'plot twist', 'ending', 'feel-good', 'Oscar (Best Directing)', 'French', 'Hayao Miyazaki', 'torture', 'cars', 'cute', 'Russell Crowe', 'Marvel', 'bleak', 'melancholy', 'Nudity (Rear)', 'treasure', 'unrealistic', 'fantasy world', 'fun', 'cynical', 'cerebral', 'movie business', 'homosexuality', 'Mel Gibson', 'mockumentary', 'pirates', 'boxing', 'Michael Caine', 'Philip K. Dick', 'intelligent', 'realistic', 'Sean Connery', 'Bob*ola', 'dreams', 'steampunk', 'pixar', 'mafia', 'Atmospheric', 'cheesy', 'Jude Law', 'existentialism', 'James Bond', 'insanity', 'business', 'storytelling', 'Post apocalyptic', 'car chase', 'Gary Oldman', 'Will Ferrell', 'bad science', 'kidnapping', 'seen at the cinema', 'british', 'gangsters', 'made for TV', 'adapted from:comic', 'poignant', 'crude humor', 'Star Trek', 'VHS', 'beautifully filmed', 'arnold', 'Ewan McGregor', 'Adam Sandler', 'cannibalism', 'irreverent', 'based on a comic', 'nazis', 'cult classic', 'mentor', 'space travel', 'spielberg', 'Africa', 'romantic', 'humor', 'computers', 'enigmatic', 'british comedy', 'NO_FA_GANES', 'suspenseful', 'claustrophobic', 'not funny', 'imagination', 'Oscar (Best Actress)', 'Zooey Deschanel', 'visceral', 'neo-noir', 'sword fight', 'vampire', 'plot holes', 'plot', 'excellent script', 'Angelina Jolie', 'History', 'Denzel Washington', 'dysfunctional family', 'Oscar (Best Supporting Actor)', 'Liam Neeson', 'dance', 'scary', 'London', 'DVD', 'Anthony Hopkins', 'super-hero', 'Joseph Gordon-Levitt', 'Keira Knightley', 'powerful ending', 'male nudity', 'DIVX', 'mathematics', 'science', 'Simon Pegg', 'artistic', 'epic', 'genetics', 'dogs', 'Shakespeare', 'anti-hero', 'bad plot', 'prostitution', 'stranded', 'Classic', 'own', 'sentimental', 'Batman', 'Uma Thurman', 'Drama', 'Sylvester Stallone', 'Heath Ledger', 'seen', 'nature', 'Nicole Kidman', 'disability', 'big budget', '06/11', 'AFI 100', 'addiction', 'monster', 'deadpan', 'slow paced', 'nostalgic', 
'based on a TV show', 'trains', 'meditative', 'slasher', 'Holocaust', 'police corruption', 'btaege', 'dark hero', 'Jason Statham', 'masterpiece', 'Kevin Smith', 'Jack Black', 'characters', 'vigilante', 'romantic comedy', 'Sandra Bullock', 'Wes Anderson', 'dancing', 'understated', 'notable soundtrack', '1960s', 'conspiracy theory', 'Ben Stiller', 'Mafia', 'Monty Python', 'Owen Wilson', 'Seth Rogen', 'claymation', 'brutal', 'Los Angeles', 'detective', 'beautiful scenery', 'courtroom', 'jesus', 'christianity', 'John Cusack', 'college', 'kung fu', 'spoof', 'writers', 'Steve Carell', 'alien invasion', '1970s', 'David Lynch', 'John Malkovich', 'amnesia', 'film noir', '19th century', 'Funny', 'Dustin Hoffman', 'Akira Kurosawa', 'Ridley Scott', 'Biography', 'Hugh Jackman', 'Vietnam War', 'owned', 'pregnancy', 'sex', 'propaganda', 'cgi', 'samurai', '03/11', 'medieval', 'India', 'Jodie Foster', '05/11', 'bad ending', '01/11', 'Julia Roberts', 'Based on a TV show', 'biopic', 'siblings', 'Christopher Nolan', 'scenic', '11/10', 'teen movie', 'Ben Affleck', '02/11', 'disappointing', 'prison escape', 'ridiculous', 'based on book', 'twists & turns', 'Mark Wahlberg', 'Jackie Chan', 'lyrical', 'Meryl Streep', 'animals', 'dialogue driven', 'androids', \"Eric's Dvds\", 'happy ending', 'sweet', 'father daughter relationship', 'original plot', 'Terry Gilliam', 'remade', 'Christopher Walken', 'Watched', 'Star Wars', 'Peter Jackson', 'good dialogue', 'script', 'women', '11/11', 'Roman Polanski', 'surprise ending', 'eerie', 'adapted from:play', 'PG-13', 'deliberate', 'anti-war', 'Annemari', 'Hugh Grant', 'confusing', 'Anamorphic Blow-Up', 'ominous', 'unique', 'stylish', 'Tommy Lee Jones', 'Cameron Diaz', 'Kirsten Dunst', 'dragons', 'screwball comedy', 'intellectual', '007', 'Ellen Page', 'girlie movie', 'depression', 'Stereoscopic 3-D', 'Woody Harrelson', 'Australia', 'dinosaurs', 'Special Effects', 'food', 'aviation', 'dramatic', 'political', 'island', 'George Lucas', 'paranoid', 'talky', 
'Below R', 'watch the credits', 'Emma Stone', 'stand-up comedy', 'Middle East', 'animated', 'Jean Reno', 'etaege', 'haunted house', 'courtroom drama', 'AFI 100 (Laughs)', 'obsession', 'want to see again', 'library vhs', 'motorcycle', 'Anne Hathaway', 'USA', 'environmental', 'childhood', 'lawyers', 'erotic', 'poverty', 'guns', 'brutality', 'Fantasy', 'Futuristmovies.com', 'John Travolta', 'spy', 'Jeff Bridges', '12/10', 'photography', 'time loop', 'Charlie Kaufman', 'television', 'technology', 'Ryan Gosling', 'computer animation', 'Latin America', '1930s', 'absurd', 'baseball', 'genius', 'submarine', 'child abuse', 'divorce', 'Jake Gyllenhaal', 'Milla Jovovich', 'paranoia', 'assassination', 'sexual', 'hackers', 'China', 'fighting', 'audience intelligence underestimated', 'werewolves', 'Sean Penn', 'New York', 'christian', 'menacing', 'feel good movie', 'Oscar (Best Supporting Actress)', 'great cinematography', 'Jane Austen', 'video game adaptation', 'Sigourney Weaver', 'video games', 'Alan Rickman', 'redemption', 'wedding', 'stop motion', 'David Fincher', 'investigation', 'Jennifer Aniston', 'football', 'scifi', 'immortality', 'goofy', '12/11', 'alcoholism', 'Adventure', 'M. 
Night Shyamalan', 'archaeology', 'South America', 'Kate Winslet', 'rebellion', 'marriage', 'revolution', 'Lars von Trier', '1950s', 'splatter', 'Quirky', '10/10', 'Gene Hackman', 'royalty', 'AFI 100 (Thrills)', 'suburbia', 'Colin Farrell', 'BGAB LRC', 'Robert Rodriguez', 'Vin Diesel', 'Crime', 'alter ego', 'not available from Netflix', 'strong female lead', 'dragon', 'good acting', 'end of the world', 'off-beat comedy', 'Musical', 'Charlize Theron', 'not a movie', 'special effects', 'marijuana', 'Hitchcock', 'cross dressing', 'Eddie Murphy', 'fight scenes', 'Sam Rockwell', 'Jennifer Lawrence', \"so bad it's good\", 'interesting', 'Guy Ritchie', 'Horror', 'mythology', 'Kevin Costner', 'immigrants', 'pretentious', 'mars', 'Zach Galifianakis', 'quotable', 'culture clash', 'grim', 'Germany', 'Clive Owen', '01/12', 'heartbreaking', 'james bond', 'Dark', 'forceful', 'justice', 'Catholicism', 'Animation', 'WWII', 'Paul Rudd', 'virus', 'dystopic future', 'mother-son relationship', 'nudity', 'spaghetti western', 'realism', 'action packed', 'Michael Cera', 'dark fantasy', 'incest', 'wry', 'Helena Bonham Carter', 'independent film', 'self discovery', 'bollywood', 'zombie', 'hitman', 'Mila Kunis', 'cold war', 'slapstick', 'Las Vegas', '3D', 'childish', 'AFI 100 (Cheers)', 'Sexualized violence', 'multiple roles', 'secrets', 'KAF', 'Biblical', 'gruesome', 'better than expected', 'adolescence', 'witch', 'Ireland', 'Best of Rotten Tomatoes: All Time', 'Gwyneth Paltrow', 'Disney animated feature', 'ocean', 'War', 'isolation', 'melancholic', 'road movie', 'Cate Blanchett', 'alien', 'biographical', 'Hollywood', 'great performances', 'gambling', 'short-term memory loss', 'Sci-Fi', 'spies', 'AFI 100 (Movie Quotes)', 'blindness', 'Colin Firth', 'strippers', 'Matthew McConaughey', 'Orson Welles', 'Golden Palm', 'demons', 'desert', 'con artists', 'oscar (best cinematography)', 'sexy', 'Amy Adams', 'complex characters', 'Oscar (Best Effects - Visual Effects)', 'Daniel Craig', '100 
Essential Female Performances', 'over the top', 'unlikeable characters', 'Drew Barrymore', 'brothers', 'compassionate', 'intimate', 'Guillermo del Toro', 'ghosts/afterlife', 'Willem Dafoe', 'good versus evil', 'Ralph Fiennes', 'Rachel McAdams', 'Steve Buscemi', 'Ryan Reynolds', 'bad script', 'bullying', '04/11', 'Surreal', 'cheerleading', 'undercover cop', 'Underrated', 'Funny as hell', 'Peter Sellers', 'nerds', 'need to own', 'Vietnam', 'werewolf', 'schizophrenia', 'downbeat', 'confrontational', 'Reese Witherspoon', 'My movies', 'treasure hunt', 'thought provoking', 'Amazing Cinematography', 'Werner Herzog', 'Ingmar Bergman', 'Tarantino', 'James Cameron', 'Jonah Hill', 'fashion', 'boarding school', 'Oscar (Best Foreign Language Film)', 'ballet', 'Ennio Morricone', 'Julianne Moore', 'mad scientist', 'harry potter', 'James Stewart', 'Jet Li', 'jus+san', 'Robert Redford', 'infidelity', 'Michael Fassbender', 'strange', 'blood', 'pointless', 'twist', 'Orlando Bloom', 'Oscar Winner', 'World War I', 'Vincent Price', 'US President', 'Dreamworks', 'teenagers', 'add to prospects list', 'disaster', '08/10', 'literary adaptation', 'Sergio Leone', 'slavery', 'Adrien Brody', 'alternate universe', 'superheroes', 'bank robbery', 'Winona Ryder', 'physics', 'Michael Douglas', 'hacking', 'oscar (best directing)', 'CIA', 'fast paced', '03/10', 'Naomi Watts', 'nuclear war', 'stereotypes', '1920s', 'Italy', 'Ethan Hawke', 'christmas', 'moody', 'Joaquin Phoenix', 'Teen movie', 'great dialogue', 'IMAX Digital only', 'school', 'Audrey Hepburn', '18th century', 'biting', 'classical music', 'literate', 'Cary Grant', '2D animation', 'Viggo Mortensen', 'Dynamic CGI Action', 'redbox', 'Kristen Stewart', 'homophobia', 'James Franco', 'high fantasy', 'Humphrey Bogart', 'unintentional comedy', 'San Francisco', 'Jim Jarmusch', 'gangs', 'Daniel Day-Lewis', 'Rachel Weisz', 'Maggie Gyllenhaal', 'Scotland', 'cyborgs', 'surveillance', 'CGI', 'Scary Movies To See on Halloween', '80s', 'cloning', 'weak 
plot', 'Mel Brooks', \"Sven's to see list\", 'passionate', 'IMAX DMR', 'Brazil', 'Gfei own it', 'art', '06/10', 'vhs', 'Paul Newman', 'hotel', 'Michael Crichton', 'disease', 'super hero', 'Divx1', 'workplace', 'earnest', 'Patrick Stewart', 'David Cronenberg', 'PG13', 'Gay Lead Character', 'Forest Whitaker', 'Bond', 'Spherical Blow-Up', 'marvel', 'beautiful cinematography', 'transgender', 'IMAX DMR 3-D', 'colonialism', 'psychedelic', 'voyeurism', 'Oliver Stone', 'Michael J. Fox', 'Sci-fi', '05/10', 'coen brothers', 'youtube', 'Meg Ryan', 'family bonds', 'gangster', 'Nostalgia Critic', 'Vince Vaughn', 'pornography', 'New Zealand', 'French Film', 'wizards', 'amazing photography', 'Gerard Butler', 'Ben Kingsley', 'Kevin Bacon', 'somber', 'austere', 'soccer', 'Bradley Cooper', 'cia', 'Emma Thompson', 'product placement', 'Bruce Campbell', 'Antonio Banderas', 'Kurt Russell', 'afterlife', 'Jason Segel', 'REDBOX', 'orphans', 'Marlon Brando', 'Pierce Brosnan', 'wired 50 greatest soundtracks', 'vengeance', 'Mark Ruffalo', 'Film Noir', 'Michael Moore', 'PG', 'underrated', 'drinking', 'nostalgia', 'husband-wife relationship', 'Jennifer Connelly', 'Revenge', 'Netflix Streaming', 'Aardman', 'flashbacks', 'modern fantasy', 'jazz', 'Good Romantic Comedies', '01/10', 'heroine in tight suit', 'Mystery', 'unpredictable', 'Christopher Lloyd', 'Tolkien', 'John Goodman', 'Black comedy', 'entertaining', 'paranormal', 'complicated plot', 'indiana jones', 'Music', 'Steve Martin', 'Paul Giamatti', 'no plot', 'smart', 'Mexico', 'teacher', 'chase', 'android(s)/cyborg(s)', 'Kick-Butt Women', 'competition', 'James McAvoy', 'Jessica Alba', 'Sacha Baron Cohen', 'cult', 'atheism', 'farce', 'Sam Raimi', 'buddy movie', 'watched 2006', 'John Wayne', 'Luc Besson', 'Iran', 'bloody', 'should like', 'lavish', 'colourful', 'suicide attempt', 'predictable ending', 'Tragedy', 'Hannibal Lecter', 'Jean-Claude Van Damme', 'unfunny', 'private detective', 'good soundtrack', 'poker', 'weak ending', 'FIGHTING THE 
SYSTEM', 'German', 'jungle', 'unintentionally funny', 'Robert Downey Jr', 'Boring', 'french', 'shaky camera', 'Emma Watson', 'found footage', 'sci fi', 'cast', 'Iraq War', 'Takashi Miike', 'Edgar Award (Best Motion Picture)', 'Ron Perlman', 'slackers', 'crappy sequel', 'effects', 'John Carpenter', 'John Cleese', 'FEWER than 300 ratings', 'sweeping', 'must see', 'Cold War', 'Ashton Kutcher', 'Rome', 'affectionate', 'basketball', 'Emily Blunt', 'Hugo Weaving', 'england', 'Cult classic', 'holocaust', 'noir', 'cats', 'mountain climbing', 'Argentina', 'disney', 'drug abuse', 'Cillian Murphy', 'scope', 'AIDS', 'muppets', 'ninja', 'instant view', 'rousing', 'Mickey Rourke', 'light', 'Sherlock Holmes', 'abortion', 'Steven Soderbergh', 'Billy Bob Thornton', 'Benicio Del Toro', 'Don Cheadle', 'Spain', 'betrayal', 'family relationships', 'writing', 'Takeshi Kitano', 'Want', 'jim carrey', 'Val Kilmer', 'Finnish', 'Highly quotable', 'Richard Gere', 'blaxploitation', 'midlife crisis', 'brother-brother relationship', 'fascism', 'Kaiju', 'aging', 'Italian', 'twins', 'autism', 'Gene Wilder', 'space opera', 'double life', 'japan', 'historically inaccurate', 'Francis Ford Coppola', 'bisexual', 'natural disaster', 'feminism', 'Christoph Waltz', 'underdog', 'Ed Harris', 'heroin', '3d', 'book was better', 'Broadway', 'Texas', 'Russian', 'devil', 'realistic action', 'communism', 'explosions', 'dumb', 'Elijah Wood', 'Fritz Lang', 'criterion', 'first contact', 'character development', 'Oscar (Best Music - Original Song)', 'capitalism', 'silent movie', 'Berlin', 'geeks', 'spying', 'Russia', 'Jennifer Garner', 'South Africa', 'very good', 'greed', 'money', 'alone in the world', 'racing', '1990s', 'VistaVision', 'Predictable', 'Stoner Movie', 'post apocalyptic', 'unlikely friendships', 'Neil Patrick Harris', 'Documentary', 'charming', 'wistful', 'indie', 'wrongly accused', 'seen 2010', 'books', 'cartoon', 'Tobey Maguire', 'giant robots', 'wilderness', 'fanciful', 'camp', 'singing', 'Seen 
2014', 'dull', 'Civil War', 'new york', 'monsters', 'BDSM', 'Exceptional Acting', 'Toshiro Mifune', 'TV', 'Catherine Zeta-Jones', 'harsh', 'stylized violence', 'not as good as the first', 'swashbuckler', 'DC Comics', 'John Hughes', 'Daniel Radcliffe', 'American Civil War', '09/10', 'internet', 'gay romance', 'music business', 'Michael Bay', 'imdb bottom 100', 'caper', 'Matthew Broderick', 'Thriller', 'Jeff Goldblum', 'life & death', 'Christopher Guest', 'rated-R', 'imaginary friend', 'Notable Nudity', 'morality', 'experimental', 'mutants', 'con men', 'Brian De Palma', 'ummarti2006', 'get', 'Tim Roth', 'Jesse Eisenberg', 'catastrophe', 'Greg Kinnear', 'cancer', 'Miyazaki', 'New Orleans', 'birds', 'civil war', 'Interesting', 'David Bowie', 'Mike Myers', 'alternate history', 'Darren Aronofsky', 'visual', 'Audrey Tautou', 'inventor', '12/09', 'Chan-wook Park', 'vigilantism', 'casino', 'Michel Gondry', 'bizarre', '007 (series)', 'tom hanks', 'joss whedon', 'foreign language', 'Chris Evans', 'Alaska', 'Paul Thomas Anderson', 'switching places', 'j netflix', 'High School', 'seen 2011', 'complex', 'Oscar (Best Music - Original Score)', 'hostage', 'costume drama', 'Christina Ricci', 'Jean-Luc Godard', 'Brendan Fraser', 'Steve McQueen', 'Handycam', 'possession', 'madcap', 'space program', 'Danny DeVito', 'Bible', 'japanese', 'no dialogue', 'avi', 'hallucination', 'Jason Bateman', 'death/fatality', 'wuxia', 'Overrated', 'sexual abuse', 'Hong Kong', 'HAUNTED BY THE PAST', 'mental hospital', 'Ricky Gervais', 'non-linear', 'open ending', 'school drama', 'cooking', 'swedish', 'manipulation', 'alternate endings', 'tear jerker', 'movielens top pick', 'moon', 'entirely dialogue', 'Beatles', '100 Greatest Movies', 'Nudity (Full Frontal - Brief)', 'Guy Pearce', 'Funniest Movies', 'writer', 'Quotable', 'mistaken identity', 'teens', 'genocide', 'William Shatner', 'epidemic', 'Depp & Burton', 'My DVDs', 'hulu', 'sad ending', 'foul language', 'FBI', 'Nick Frost', 'Soundtrack', 'meta', '55 
movies every kid should see--Entertainment Weekly', 'Tokyo', 'Buster Keaton', 'Donald Sutherland', '3', 'Charlie Chaplin', '07/10', '02/10', 'B-movie', 'afternoon section', 'Australian', 'Marx Brothers', 'guilty pleasure', 'jealousy', 'mother daughter relationship', 'awesome', 'idealism', 'Whoopi Goldberg', 'Oscar Nom 2007', 'honest', 'patriotic', 'Suspense', 'race issues', 'Kiefer Sutherland', 'book', 'May-December romance', 'halloween', 'Halle Berry', 'sad but good', 'Helen Mirren', 'John Woo', 'asylum', 'police investigation', 'censorship', 'Chicago', 'lone hero', '11/09', 'working class', 'R:language', 'psychiatry', 'thoughtful', 'inspiring', 'Epic', 'Tim Robbins', 'episodic', 'love triangles', 'parallel universe', 'death penalty', 'occult', 'Kate Beckinsale', 'puppets', 'top 250 IMDB', 'tragedy', 'John Turturro', 'batman', 'Seen 2013', 'Ang Lee', 'train', 'David Mamet', 'vocalists', 'IMDB Top 250', 'adaptation', 'body horror', 'awesome soundtrack', 'nocturnal', 'ohsoso', 'virginity', 'elegiac', 'Dan Aykroyd', 'Mads Mikkelsen', 'wintry', 'robbery', 'view askew', 'on computer', 'Spanish', 'Jamie Foxx', 'my addition to ML', 'Business is the antagonist', 'Mindfuck', 'USA film registry', 'hospital', 'great cast', 'hip hop', 'Godzilla', 'Beautiful', 'Seen 2008', 'Sidney Lumet', 'director-screenwriter', 'good music', 'time', 'circus', 'macabre', 'Seen 2006', 'shallow', 'far future', 'vietnam war', 'poetry', 'Robert Zemeckis', 'Billy Wilder', 'Potential Oscar Nom', 'watched 2007', '9/11', 'character driven', 'minimalist', 'Hawaii', 'american idiocy', 'awkward', 'Own It', 'torrential downpour', 'Gay Character', 'intelligent sci-fi', 'NASA', 'Marvel Cinematic Universe', 'gory', 'Gay', 'social criticism', 'Geoffrey Rush', 'moral ambiguity', 'evolution', 'slick', 'airport', 'Ian McKellen', 'foreign', 'identity', 'kids', 'guilt', 'Friday night movie', 'secret service', 'johnny depp', 'Steven Seagal', 'nude black women', 'interracial romance', 'Robert Duvall', 'escape', 
'painter', '\\\\\\\\\"found footage\\\\\"\\\\\"\"', 'George A. Romero', 'Jamie Lee Curtis', 'seen 2012', 'Renee Zellweger', 'Gregory Peck', 'Xmas theme', 'Tilda Swinton', 'urbane', 'gloomy', 'based on comic', 'prequel', 'Leonard Nimoy', 'gunfight', 'Not available from Netflix', 'paradox', 'Islam', 'pacing', 'corny', 'meaning of life', 'horses', 'adoption', 'Native Americans', 'travel', 'Sarah Jessica Parker', 'John Ford', 'Richard Linklater', 'actors', 'bad dialogue', 'Katherine Heigl', 'not on DVD', 'playwright:Shakespeare', 'John C. Reilly', 'consumerism', 'teenager', 'lame', 'matter-of-fact', 'artist', 'Michael Keaton', 'series', 'gentle', 'forest', 'robot', 'biology', 'Michelle Pfeiffer', 'MTSKAF', 'fairy tales', \"adapted from B'way\", 'Dimensionalized 2-D to 3-D', 'glbt', 'court', 'bad cgi', 'Added', 'Irish', 'mummy', 'doctors', 'stephen king', 'heroism', 'nudity (topless - notable)', 'chess', 'Shia LaBeouf', 'Dakota Fanning', 'vikings', 'sniper', 'stupid ending', 'slow start', 'Spaghetti Western', 'my shopping list', 'Kurosawa', 'Marion Cotillard', 'single parents', 'INNOCENCE LOST', 'rats', 'Michael Mann', 'Toho', 'Iceland', 'restaurant', 'Javier Bardem', 'brainwashing', 'based on true story', 'Zach Braff', 'airplane', 'Jessica Biel', 'grindhouse', 'Frank Miller', 'Rowan Atkinson', 'math', 'trilogy', 'Dance', 'raunchy', 'Seen 2009', 'long takes', 'Ei muista', 'Demi Moore', 'hit men', 'Julie Andrews', 'bruce willis', 'Canada', 'geeky', 'unoriginal', 'very funny', 'astronomy', 'uplifting', 'bad sequel', 'Gus Van Sant', 'melodrama', 'Andrew Garfield', 'artsy', 'atmosphere', 'sarcasm', 'Harvey Keitel', 'shark', 'Leslie Nielsen', 'Jack Lemmon', 'abuse', 'Steve Coogan', 'Ingrid Bergman', \"WRITER'S LIFE\", 'Claire Danes', 'god', 'freedom', 'Ron Howard', 'segregation', 'sport:boxing', 'curse', 'Stupid as Hell', '2.5', 'smoking', 'adapted from:game', 'Paul Verhoeven', 'Jason Schwartzman', 'Nicholas Cage', 'POT_ESTAR_BE', 'hard to watch', 'giant monster', 'Judaism', 
'Gerard Depardieu', 'Casey Affleck', 'lgbt', 'Myth', 'Frank Capra', 'weed', 'Comic Book adaption', 'Period piece', 'warm', 'nuns', 'private school', 'Elizabeth Banks', 'teleportation', 'man versus machine', 'murder mystery', 'Jeremy Renner', 'Alien Invasion', 'na dysku', 'Pedro Almodovar', 'SUPERNATURAL ROMANCE', 'unsimulated sex', 'costumes', 'good', 'toys', 'astronauts', 'visuals', 'radio', 'smuggling', 'Danny Boyle', 'clowns', 'supernatural powers', 'religious overtones', 'Dennis Hopper', 'breakthroughs', 'Seann William Scott', 'Lindsay Lohan', 'innovative', 'president', 'Dwayne Johnson', 'Rosario Dawson', 'factual', 'crazy', 'weapons', 'china', 'Tina Fey', 'punk', 'Owned', 'Michael Curtiz', 'tension', 'ironic', 'Judi Dench', 'Christian Slater', 'Lucy Liu', 'Howard Hawks', '10/09', 'Emerson must see', 'corporate America', 'Adolf Hitler', 'silly fun', 'Best War Films', 'Megan Fox', 'adapted from:book series', 'literature', 'Wallace & Gromit', 'drab', 'Edgar Wright', 'Godzilla Universe', 'Louis C.K.', 'shakespeare', 'want to own', 'Stanley Tucci', 'too small', 'United States', 'K movie', 'Golden Raspberry (Worst Actor)', 'business is the antagonist', 'irreligion', 'Afghanistan', 'Salma Hayek', 'transformation', 'Christopher Lee', 'lies', 'Charlie Sheen', 'to-rent', 'Ray Liotta', 'Korean', 'Carey Mulligan', 'FATHERS AND SONS', 'illogical', 'Danny Trejo', 'Eva Mendes', 'Eric Bana', 'art house', 'East Germany', 'exciting', 'easygoing', 'CAV', 'rhythm & blues', 'Janus 50', 'Boston', 'stupidity', 'Olympics', 'H.P. 
Lovecraft', 'Robert Altman', 'Kristen Wiig', 'Theater', 'Iraq', 'tokyo', 'Bette Davis', 'golf', 'Hilary Swank', 'John Candy', 'Sharon Stone', 'imaginative', 'destiny', 'Kenneth Branagh', 'Roger Moore', 'Tim Allen', 'Jared Leto', 'Direction', 'Gross-out', 'Animated', 'Kristen Bell', 'Israel', 'Character study', 'feel good', 'Susan Sarandon', 'lurid', 'cool', 'banned movie', 'jay and silent bob', 'spaceships', 'Monica Bellucci', 'seafaring', 'Beautiful Woman', 'jonossa', 'Cinematography', 'apocalyptic', 'cameos', 'character study', 'Parody', 'terminal illness', 'poor acting', 'Michelle Williams', 'Chevy Chase', 'lawyer', 'BFI classic', 'exploitation', 'Bill Hader', 'thrilling', 'time-travel', 'Golden Raspberry (Worst Actress)', \"Didn't finish\", 'Tom Hardy', 'sacrifice', 'stage magic', 'SUBURBAN DYSFUNCTION', 'mvlc', 'arms dealer', 'Wizards', 'Soviet Union', 'medicine', 'want it', 'animal:dog', 'cliche characters', 'Rob Schneider', 'anti-Semitism', 'Western', 'BFI modern classic', 'dog', 'comics', 'Santa Claus', 'based on a video game', 'Cartoon', 'solitude', 'music documentary', '3.5', 'unusual', 'Billy Crystal', 'strong woman', 'Helen Hunt', 'ending twist', 'utopia', 'Tim Curry', 'updated classics', 'irish accent', 'mining', 'eccentricity', 'life', 'California', 'psychological thriller', 'Wesley Snipes', 'PROSTITUTES', 'Julia Stiles', 'Hilarious', 'editing', 'memory loss', 'magic realism', 'race', 'Noomi Rapace', 'scenery', 'BEST PICTURE', 'ethics', 'middle east', 'SIBLING RELATIONSHIPS', 'Oscar (Best Sound)', 'interesting characters', 'Vincent Cassel', 'Josh Brolin', 'ecology', 'BBC Films', 'blindfold', 'Death', 'Disturbing', 'sword fighting', 'Alec Guinness', 'Saturday Night Live', 'breaking the fourth wall', 'metaphysics', 'Toshirô Mifune', 'opera', \"so bad it's funny\", 'totalitarianism', 'sex scenes', 'impostor', 'choir', 'Michael Haneke', 'author:Stephen King', 'funeral', 'star wars', 'kids and family', 'election', 'teacher student relationship', 
'shooting', 'adapted from:TV series', 'Charles Chaplin', 'Patrick Swayze', 'spanish', 'oil', 'Robert Pattinson', 'Oscar (Best Animated Feature)', 'crap', 'Anna Kendrick', 'Lovecraftian mythology', 'flying', 'silent', 'cathartic', 'hugh jackman', 'london', 'villain nonexistent or not needed for good story', 'heavy metal', 'goth', 'getdvd', \"don't remember\", 'teenage girl', 'father son relationship', 'toplist07', 'animation remade as live action', 'Old', 'simple', 'pulp', 'elegant', 'inaccurate', 'shallow plot', 'bowling', 'Judd Apatow', 'video game', 'Diane Kruger', 'witches', 'Melancholic', 'nudity (rear)', 'Jeff Daniels', 'mob', 'Peter Pan', 'Josh Hartnett', 'camerawork', 'masturbation', 'Federico Fellini', 'nothing happens', 'Carrie-Anne Moss', 'Neil Gaiman', 'Antarctica', 'priest', 'shallow characters', 'National Lampoon', 'Rose Byrne', 'Tatsuya Nakadai', 'Heather Graham', 'Saturn Award (Best Science Fiction Film)', 'Jennifer Lopez', '10/11', 'not scary', 'period piece', 'Mike Leigh', 'psychic powers', 'Gérard Depardieu', 'reality TV', 'almost favorite', 'ummarti2007', 'contrived', 'Neo-Nazis', 'angry', 'idiotic', 'Dennis Quaid', 'author:J. R. R. 
Tolkein', 'powerful', 'CLASS DIFFERENCES', 'Heartwarming', '2', 'Complicated', 'underwater', 'few funny scenes', 'Zack Snyder', 'sisters', 'childish plot', 'Christopher Plummer', 'hunting', 'Swedish', 'Bruce Lee', 'new york city', 'resistance movement', 'cheating', 'short film', 'tattoo', 'Comedy Need to See', 'skinhead', '17th century', 'spy thriller', 'deal with the devil', 'William Wyler', 'coming-of-age', 'ryan gosling', 'yakuza', 'Zombies', 'Pixar animation', 'Cute', 'Louisiana', '1940s', 'graphic design', 'screwball', 'SPACE TRAVEL', 'oppl', 'swearing', 'invisibility', 'punk rock', 'wasted potential', 'need to buy', 'Children', 'Alec Baldwin', 'etaegeshelf', 'surfing', 'John Grisham', 'angel', 'quentin tarantino', 'wartime', 'geek', 'Egypt', 'special', 'rich and poor', 'tragic', 'Golden Raspberry (Worst Picture)', 'US history', 'Dolph Lundgren', 'sexist', 'cross dressing men', 'funny moments', 'Roberto Benigni', 'Charles Dickens', 'wheelchair', 'moving', 'Liv Tyler', 'Spanish Civil War', 'Kat Dennings', 'John Huston', 'family dynamics', 'Katherine Hepburn', 'Paul Bettany', 'Oscar Nominee: Director', 'Very interesting', 'visual effects', 'superficial', 'formulaic', 'Moving', 'too short', 'free to download', 'upbeat', 'poor plot', '(s)vcd', 'drug use', 'freedom of expression', 'Edgar Allan Poe', 'Bittersweet', 'sexism', 'Acting', 'annoying', 'holiday', 'nuclear bomb', 'fugitive', 'Julie Delpy', 'gender identity', 'animal attacks', 'Sam Neill', \"joseph campbell's study of mythology influenced\", 'assassins', 'author:Philip K. Dick', 'Hans Zimmer', 'piano', 'Do kupienia', 'gratuitous violence', 'Adaptation', 'Jeremy Irons', 'symbolism', 'not true to the book', 'quirky romantic', 'horror comedy', 'Hugh Dancy', 'hockey', 'Justin Long', 'misogyny', 'Aaron Eckhart', 'coming out', 'robert de niro', 'brad pitt', 'H.G. 
Wells', 'horrible', 'stoner comedy', 'nonsensical', 'Chris Tucker', 'Michelle Rodriguez', 'surprisingly good', 'alan rickman', 'denzel washington', 'Great movie', 'Bill Nighy', 'Giallo', 'Halloween', 'Voodoo', 'Roland Emmerich', 'Jinni Top Pick', 'old age', 'education', 'scifi cult', 'neighbors', 'world war II', 'annoying characters', 'enjoyable', 'Kevin Kline', 'Danny McBride', 'Sam Peckinpah', 'teen pregnancy', 'Greek mythology', 'buy', 'BORING!', 'board game', 'Tennessee Williams', 'Chuck Norris', 'allegory', 'Anna Faris', 'stop-motion', 'satanism', 'cycling', 'break-up', 'Shirley Temple', 'human nature', 'Harold Ramis', 'Chris Hemsworth', 'Barbara Stanwyck', 'J.J. Abrams', 'Hayden Christensen', 'Favorite', 'genetic engineering', 'Wim Wenders', 'brilliant', 'parkour', 'nightclub', 'pigs', 'oppression', 'exorcism', 'Frances McDormand', 'William H. Macy', 'Juliette Lewis', 'graphic violence', 'frantic', 'TV series', 'offensive', 'Henry Fonda', 'mecha', 'Veja', 'Amy Smart', 'Great Screenplays', \"Peter O'Toole\", 'Macaulay Culkin', 'Kung Fu', 'great music', 'Roald Dahl', 'Caribbean', 'Wes Craven', 'Amanda Seyfried', 'good story', 'Poland', 'rabbits', 'lions', 'interesting concept', 'psychiatrist', 'preachy', 'one day', 'Laurence Fishburne', 'Jane Fonda', 'mountains', 'bond', 'Kate Hudson', 'New Jersey', 'family drama', 'genital mutilation', 'mexico', 'Marilyn Monroe', 'Dr. Seuss', 'Vietnam war', 'killer-as-protagonist', 'cate blanchett', 'crude', 'Sarah Polley', 'Olivia Wilde', 'creative', 'unreliable narrators', 'Saturn Award (Best Special Effects)', 'wormhole', 'watch', 'fraternity', 'Southern theme', 'Rutger Hauer', 'chemistry between actors', 'humour', 'Super-35 Blow-Up', 'Oscar Nominee', 'Kubrick', 'Manhattan', 'gay relationship', 'killer as protagonist', 'Jim carrey', 'love triangle', 'Hunter S. 
Thompson', 'Strong Women', 'village', 'humanity', 'Great Depression', 'unbelievable', 'Gilliam', 'Tom Waits', 'maze', 'Charlton Heston', 'movie to see hd', 'snow', 'intrigue', 'economics', 'Menahem Golan', 'meryl streep', 'letters', 'sappy', 'fake documentary', 'Romania', 'rural', 'Erotic', 'Frightening', 'Martin Sheen', 'waste of time', 'in netflix queue', 'Harry Potter', 'spiritual journey', 'Great Soundtrack', 'Isaac Asimov', 'James Mason', 'King Arthur', 'memories', 'war movie', 'Kate', 'Robert Ludlum', 'bugs', 'good ending', 'Bob Dylan', 'gun fu', 'excellent', 'fate', 'comic books', 'Sweden', 'excellent cinematography', 'Kieran Culkin', 'Film Theory & Criticism', 'coen bros', 'John Landis', 'feminist', 'intelligent thriller', 'massacre', 'great photograpy', 'Oscar (Best Documentary Feature)', 'morgan freeman', 'interviews', 'Seen 2010', 'simplistic', 'Mars', 'existential', 'incoherent', 'Family', 'polyamory', 'style over substance', '2014', 'Trey Parker', 'sword and sorcery', 'Jules Verne', 'MOLT_CRITICADA', 'urban', 'theater', 'Rio de Janeiro', 'Timothy Dalton', 'Tom Hiddleston', 'Drinking', 'retro', 'photographer', 'Catchy Score', 'split personality', 'Legenda PT-BR', 'Almodovar', 'Wolfgang Petersen', 'Oscar (Best Writing - Screenplay Written Directly for the Screen)', 'facebook rec', 'TRAPPED OR CONFINED', 'homage', 'maybe', 'Cuba', 'HEROIC MISSION', 'university', 'delights', 'foqam', 'David O. 
Russell', 'Brittany Murphy', 'Superman', 'lesbian subtext', 'good concept', 'Emily Mortimer', 'slow motion', 'madness', 'anthology', 'identity crisis', 'subgenre:slasher', 'Arthurian legend', 'Nathan Fillion', \"80's\", 'Minnesota', 'Korea', 'Martial Arts', 'Agatha Christie', 'Benedict Cumberbatch', 'magical realism', 'stalker', 'documentary critique', 'global warming', 'Marisa Tomei', 'alternate timeline', 'Jewish', 'cover up', 'everything', 'creativity', 'awful', 'Ernst Lubitsch', 'painting', 'biblical', 'loss', 'Sam Mendes', 'Lance Henriksen', 'metafiction', 'wwii', 'Truman Capote', 'Iron Man', 'sensual', 'smart comedy', 'cultural references', 'George Cukor', 'Dream Sequence', 'better than the american version', 'Paul Walker', 'Diane Keaton', 'class issues', 'James Cagney', '1.5', 'conversation', 'post-traumatic stress disorder', 'added', 'man vs. nature', 'Venice', 'environment', 'growing up', 'heroine', 'Firefly', 'Buddy movie', 'SF', 'UNLIKELY FRIENDSHIPS', 'youth', 'france', 'Japanese', 'self-sacrifice', 'Mike Nichols', 'cowboys', 'fish out of water', 'Rick Moranis', 'Timothy Olyphant', 'Greece', 'rare', 'Love story', 'bears', 'Milos Forman', 'penguins', 'ending kinda ruined it', 'mask', 'weddings', 'street race', 'anthony hopkins', 'Incest', 'Excellent use of dialogue', 'Luke Wilson', 'exaggerated', 'Jay Baruchel', 'Rainer Werner Fassbinder', 'Politics', 'Stephen Fry', 'tongue-in-cheek', 'Sydney Pollack', 'landscape', 'food/cooking', 'twisted ending', 'obvious', 'Abigail Breslin', '90s', 'Martin Freeman', 'shipwreck', 'Jim Henson', 'Seen 2007', 'ultra-violence', 'colorful', 'single father', 'Prohibition', 'bab cinema', 'Old Tucson Studios', 'coincidences', 'Madness', 'Twist Ending', 'Sarah Michelle Gellar', \"Tumey's VHS\", 'Vienna', 'ambiguous ending', 'DVD Collection', 'Anton Yelchin', 'based on a poem', 'nihilism', 'Do zassania', 'closeted homosexual', 'deceased family member', 'Patricia Arquette', 'Tom Clancy', 'last man on earth', '3D version', 'The 
Avengers', 'Stephen Frears', '03/09', 'stunning', 'current', 'Max von Sydow', 'dvd-r', 'subway', 'Jason Reitman', '70mm blowup', 'Bad direction', 'princess', '80s nostalgia', 'G', 'office', 'Franka Potente', 'videogame', 'monkey', 'good plot', 'Rob Reiner', 'angelina jolie', 'convoluted', 'german expressionism', 'shocking', 'Carl Weathers', 'insomnia', 'tasteless', 'Washington DC', 'beer', 'Chris Rock', 'Sidney Poitier', 'Mary-Louise Parker', 'female warriors', 'tennis', 'James Gandolfini', 'lengthy', 'made me cry', 'Favorites', 'single mother', 'ethnic conflict', 'italy', 'extremely violent', 'Hugo Award', 'language', 'advertising', 'classic car', 'repetitive', '4', 'ghost story', 'Anime', 'Jews', 'stylistic', 'Ending', 'Seth Green', 'Friendship', 'Blake Edwards', 'hope', 'domestic violence', 'intelligent humor', 'Satoshi Kon', 'paris', 'unsimulated sex scenes', 'adult humor', 'David Cross', 'gadgets', 'Jonathan Demme', 'Grace Kelly', 'political commentary', 'BREAKUPS AND DIVORCES', '2015', 'stunts', 'accident', 'unsatisfying ending', 'Andy Garcia', 'boat', 'Terrence Malick', 'africa', 'girl power', 'goretastic', 'Mark Strong', 'dog killing', 'cross dressing women', 'child actor', 'Florida', 'Oscar (Best Editing)', 'Dario Argento', 'HIGH SCHOOL LIFE', 'naive', 'Alcatraz', 'Jesus Christ', 'emotion!', 'C.S. 
Lewis', 'wolves', 'Captain America', 'distopia', 'pre-code Hollywood', 'dumb but funny', 'Jena Malone', 'archive footage', 'multiple personalities', 'gypsy', 'terrible', 'Bill Paxton', 'Matt Dillon', 'Danny Kaye', 'terrible acting', 'suprisingly clever', 'sandra bullock', 'Rosamund Pike', 'author:Alan Moore', 'Dani2006', 'ambitious', 'Violence', 'historical epic', 'pop culture references', 'shape shifter', 'invasion', 'Evan Rachel Wood', 'tarantino', 'Saoirse Ronan', 'abduction', 'Philosophical', 'courage', 'author:Charles Dickens', 'unconventional', 'Story', 'DRUG ADDICTION', 'confusing plot', 'Magic', 'weak story', 'trash', 'Astaire and Rogers', 'Joss Whedon', 'beauty pageant', 'Academy Award Nominee', 'indians', 'bad', 'great story', 'Leslie Mann', 'sword and sandal', 'score', 'party', 'Nick Hornby', 'The Beatles', 'gripping', 'EXTRAMARITAL AFFAIRS', 'Inspiring', 'latin music', 'Nazi Germany', 'voice acting', 'winona ryder', 'anthropomorphic', 'Sofia Coppola', 'multiple realities', 'Full Moon Entertainment', 'stage', 'DC', 'John Belushi', 'bloggers', 'storm', 'courtesan', 'Bela Lugosi', 'Alicia Silverstone', 'concept', 'superpowers', 'FilmAffinity top pick', 'lesbians', 'civil rights', 'chilly', 'sketch comedy', 'dated', 'Vera Farmiga', 'climate change', 'Edgar Rice Burroughs', 'Nudity', 'ECCENTRIC FAMILIES', 'Sundance award winner', 'marx brothers', 'injustice', 'Bill Pullman', 'life philosophy', 'clones', 'Classical Studies', 'Dystopia', 'Jon Favreau', 'Justin Timberlake', 'WORK ETHICS', 'Richard Matheson', 'folk music', 'poetic', 'bland', 'Tom Wilkinson', 'Baz Luhrmann', 'ZAZ', 'David Lean', 'wine', 'watch again', 'Toni Collette', 'Jason Mewes', 'Don Siegel', 'nationalism', 'Emile Hirsch', 'Rupert Grint', '1', 'spaceflight', 'experiment', 'Kathy Bates', 'deafness', 'United Nations', 'buddhism', 'dolls', 'old', '2000s', 'Political', 'James Spader', 'Drugs', 'setting:NYC', 'UNREQUITED LOVE', 'Psychological horror', 'good action', 'Joe Pesci', 'OBSESSIVE 
QUESTS', 'activism', 'Unexpected Ending', 'dictatorship', 'Turkey', 'crime gone awry', 'child killing', 'Science Fiction', 'poets', 'Louis Malle', 'no nudity', 'Brooklyn', 'sean penn', 'Deep', 'beautiful effects', 'sex scene', 'rivalry', 'makes you think', 'relativity', 'Joe Dante', 'unresolved', 'Eric Rohmer', 'Christian', 'Gene Kelly', 'no happy ending', 'water', 'homophobic', 'iraq', 'sequel better than original', 'Zac Efron', 'Jessica Chastain', 'Luis Buñuel', 'prejudice', 'Oscar Nominee: Adapted Screenplay', 'haunting', 'Oscar Wilde', 'Playlist', 'complex script', 'Philadelphia', 'Richard Dreyfuss', 'profanity', 'actually funny', 'obsessive compulsive disorder', 'Yasujirô Ozu', 'sadism', 'corporations', 'Charlotte Gainsbourg', 'political corruption', 'Stephen Chow', 'liberal propaganda', 'tearjerking', 'Indiana Jones', 'Cuba Gooding Jr.', 'Albert Finney', 'cops', 'Visuals', 'corporate espionage', 'Jean Renoir', 'François Truffaut', 'forgettable', 'beach', 'baby', 'cave', 'Gulf War', 'Breathtaking', 'Peter Cushing', 'childhood classics', 'bromance', 'acting debut', 'River Phoenix', 'Nick Nolte', 'Aardman studios', 'Vinnie Jones', 'perfect', 'government', 'gods', 'simple plot', 'librarians', 'sailing', 'lovecraftian', 'mental health', 'Francis Ford Copolla', 'sister sister relationship', 'well done', 'story-in-a-story', 'Buddhism', 'French New Wave', 'NC-17', 'Boris Karloff', 'Michael Winterbottom', 'Based on a book', 'Neil Simon', '16th century', 'figure skating', 'cat killing', 'hillarious', 'will ferrell', 'who cares DVDs', 'hypnosis', 'Amanda Peet', 'Patton Oswalt', 'classical', 'heaven and hell', 'Detroit', 'Isolation', 'Thailand', 'Eli Roth', 'time-lapse', 'Gemma Arterton', 'American propaganda', \"So bad it's good\", 'not available from Netflix streaming', 'Charles Bronson', 'Seen 2011', 'odd', 'Katie Holmes', 'millenial foreign comedies to see', 'dwarf', 'Catherine Keener', 'Remake', \"Vincent D'Onofrio\", 'ben stiller', 'faith', 'Elmore Leonard', 
'indecent', 'political satire', 'Barcelona', 'FIRST LOVE', 'Own', 'Slashy', 'Idris Elba', 'Ken Watanabe', 'val kilmer', 'Bette Midler', 'Juliette Binoche', 'the book was better', 'highly quotable', 'hell', 'sequels filmed simultaneously', 'flashy', 'Matthew Perry', 'Enigmatic', 'life choices', 'Vulgar', 'shopping', 'family gatherings', 'style', 'excellent acting', 'runaway', 'Surrealism', 'Chow Yun Fat', 'Dick Van Dyke', 'Chinese', 'Harold Lloyd', 'Piper Perabo', 'trial', 'Sam Worthington', 'Navy', 'country music', 'chocolate', 'stupid stereotypes', 'happy', 'DOCTORS AND PATIENTS', 'WITCHCRAFT', 'architecture', 'mutation', 'great', 'weak characters', 'infection', 'Joseph L. Mankiewicz', 'blackmail', 'trippy', 'spelling bee', 'vacation', 'lighthearted', 'toplist08', 'black humour', 'non-hollywood ending', 'William A. Wellman', 'Mervyn LeRoy', 'Fred Zinnemann', 'rotoscoping', 'city under attack', 'afghanistan', 'NOTHING GOES RIGHT', 'alone', 'Robert Wise', 'cliched', 'idealistic', 'bad adaptation', 'pedophile', 'LOVE TRIANGLES', 'Peter Sarsgaard', 'David Duchovny', 'mother-daughter relationships', 'fish', 'german', 'BOUNTY HUNTERS', 'Bernardo Bertolucci', 'Bryan Singer', 'favorite', 'homeless', 'Cute!', 'Takashi Shimura', 'great concept', 'action spoof', 'blind badass', 'katsomattomat', 'mystic warriors', 'bombs', 'bromantic', 'fountain of youth', 'germany', 'hard to rate', 'bright', 'starship pilots', 'quest', 'Amy Poehler', 'antihero', 'multinational settings', 'Clarence Brown', 'tcm', 'choreographic violence', 'Elizabeth Taylor', 'CONS AND SCAMS', 'Annette Bening', 'author:Jane Austen', 'brazil', 'ugly', 'Errol Morris', 'Andy Samberg', 'dream', 'Ken Loach', 'Yahoo Top Pick', 'pseudo-intelligent', 'Saturn Award (Best Writing)', 'Bollywood', 'Gary Sinise', 'cdon', 'storyline', 'Basil Rathbone as Sherlock Holmes', 'misfits', 'implausible', 'quiet', 'Victorian era', 'jennifer aniston', 'special forces', 'Joel Schumacher', 'distorted reality', 'Minnie Driver', 'sexy 
girls', 'Suicide', 'skateboarding', 'Glenn Close', 'Martin Lawrence', 'parenthood', 'autumnal', 'Barry Levinson', '70s', 'Brat Pack', 'Sports Coordinator:Mark Ellis', 'George Orwell', 'rate down', 'Parallel universe', 'underdogs', 'The Chosen One', 'Depressing', 'Creature Feature', 'painful to watch', \"Tumey's To See Again\", 'Psychopathy', 'John Hurt', 'group psychology', 'on disc', 'John Waters', 'media', 'Spike Jonze', \"Walked out/didn't finish\", 'derivative', 'Danny Glover', 'Lucas', 'psychiatrist as protagonist', 'futurama', 'occult technology', 'little dialogue', 'Victor Hugo', 'Slow', 'telekinesis', 'fbi', 'overacting', 'show business', 'Michael Apted', 'Zombie', 'mentor/trainer', 'Mandy Moore', 'to see: horror', 'toplist11', 'Farrelly Brothers', 'stoner movie', 'Michael Powell', 'angels', 'Sex Comedy', 'Greta Garbo', 'Wixom Library', 'setting:LA', 'PSYCHIC ABILITIES', 'mermaid', 'graphic novel', 'scandal', 'obesity', 'transhumanism', 'homosexual undertones', 'Henry King', 'Gary Cooper', 'cameo-fest', 'Kenji Mizoguchi', 'Ving Rhames', 'toplist09', 'drug trade', 'Roman empire', 'sick', 'upper class', 'boxing drama', 'like-i-like top pick', 'Crispin Glover', 'boys', 'Jean-Pierre Jeunet', 'Atheism', 'broadway', 'W.S. 
Van Dyke', 'compelling', 'EXPERIMENTS GONE AWRY', 'Black Comedy', 'Klaus Kinski', 'Alejandro Gonzalez Inarritu', 'lame ending', 'William Hurt', 'being a kid again', 'Memorable Characters', 'performances', 'Anthony Mann', 'slow pacing', 'aerial dogfights', 'disjointed timeline', 'bureaucracy', 'affair', 'macho', 'fake', 'coma', 'double agents', 'unnecessary sequel', 'spirits', 'brutal violence', 'to see: b-grade horror', 'Carl Sagan', \"so bad it's almost good\", 'toplist12', \"always watch it when it's on tv\", 'lovecraft', 'Cary Elwes', 'SEXUAL AWAKENING', 'riveting', \"Frankenstein's monster\", 'SURVIVAL', 'sunny', 'Michelle Monaghan', 'Goldie Hawn', 'NR', 'Rob Zombie', 'sport:American football', 'ambition', 'dissapointing', 'sg', 'Carrie Fisher', 'premonition', 'Jennifer Jason Leigh', 'Doctor Who', \"Nostalgia Critic's Top 20\", 'Fantasy World', '5 stars', 'native americans', 'wrestling', 'news media', 'Clea DuVall', 'amateur detective', 'flat characters', 'Malcolm McDowell', 'Comic Book', 'multiple interpretations', 'india', 'poor dialogue', 'nuclear', 'boot camp', 'classic comedy', 'unlikable characters']\n"
     ]
    }
   ],
   "source": [
    "tag_value_counts = df_tag['tag'].value_counts()\n",
    "top_tags=tag_value_counts[\n",
    "    tag_value_counts>=20\n",
    "].index.tolist()\n",
    "print(top_tags)\n",
    "df_tag = df_tag[\n",
    "    df_tag['tag'].isin(top_tags)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>tag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>65</td>\n",
       "      <td>208</td>\n",
       "      <td>dark hero</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>65</td>\n",
       "      <td>353</td>\n",
       "      <td>dark hero</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>65</td>\n",
       "      <td>521</td>\n",
       "      <td>noir thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>65</td>\n",
       "      <td>592</td>\n",
       "      <td>dark hero</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>65</td>\n",
       "      <td>668</td>\n",
       "      <td>bollywood</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId            tag\n",
       "1      65      208      dark hero\n",
       "2      65      353      dark hero\n",
       "3      65      521  noir thriller\n",
       "4      65      592      dark hero\n",
       "5      65      668      bollywood"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tag.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 为标签名称做索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two-way lookup between a tag name and its position in top_tags.\n",
    "tag_index_to_name_dict = dict(enumerate(top_tags))\n",
    "tag_name_to_index_dict = {\n",
    "    name: position for position, name in tag_index_to_name_dict.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lv/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>tag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>65</td>\n",
       "      <td>208</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>65</td>\n",
       "      <td>353</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>65</td>\n",
       "      <td>521</td>\n",
       "      <td>233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>65</td>\n",
       "      <td>592</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>65</td>\n",
       "      <td>668</td>\n",
       "      <td>721</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  tag\n",
       "1      65      208  428\n",
       "2      65      353  428\n",
       "3      65      521  233\n",
       "4      65      592  428\n",
       "5      65      668  721"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tag['tag'] = df_tag['tag'].apply(lambda tag_name:tag_name_to_index_dict[tag_name])\n",
    "df_tag.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>tag_index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>65</td>\n",
       "      <td>208</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>65</td>\n",
       "      <td>353</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>65</td>\n",
       "      <td>521</td>\n",
       "      <td>233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>65</td>\n",
       "      <td>592</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>65</td>\n",
       "      <td>668</td>\n",
       "      <td>721</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  movie_id  tag_index\n",
       "1       65       208        428\n",
       "2       65       353        428\n",
       "3       65       521        233\n",
       "4       65       592        428\n",
       "5       65       668        721"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tag.columns = ['user_id','movie_id','tag_index']\n",
    "df_tag.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 在df_tag中，有多少用户、电影和标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "用户数 6731\n",
      "电影数 16934\n",
      "标签数 2952\n"
     ]
    }
   ],
   "source": [
    "user_quantity = len(df_tag['user_id'].unique())\n",
    "movie_quantity = len(df_tag['movie_id'].unique())\n",
    "tag_quantity = len(top_tags)\n",
    "print('用户数',user_quantity)\n",
    "print('电影数',movie_quantity)\n",
    "print('标签数',tag_quantity)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 建立用户标签矩阵，行是用户的索引，列是标签的索引，值是用户使用该标签的次数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_id_to_index_dict = {}\n",
    "user_index_to_id_dict = {}\n",
    "\n",
    "# Zero-initialised (user, tag) count matrix. int32 is used because a count\n",
    "# above 127 does not fit into the int8 range.\n",
    "user_tag_array = np.zeros(shape=(user_quantity, tag_quantity), dtype=np.int32)\n",
    "\n",
    "# groupby yields one sub-frame per user_id; the enumeration position becomes\n",
    "# that user's row index in user_tag_array.\n",
    "for index, (user_id, groupby_userid) in enumerate(df_tag.groupby('user_id')):\n",
    "    user_id_to_index_dict[user_id] = index\n",
    "    user_index_to_id_dict[index] = user_id\n",
    "\n",
    "    # The row has to be filled inside the loop: placed after the loop, these\n",
    "    # statements would only ever write the row of the last user.\n",
    "    # NOTE: this rebinds the notebook-level name tag_value_counts.\n",
    "    tag_value_counts = groupby_userid['tag_index'].value_counts()\n",
    "    line_data = np.zeros(shape=tag_quantity, dtype=np.int32)\n",
    "    # value_counts() is indexed by tag_index, which addresses the column.\n",
    "    line_data[tag_value_counts.index.values] = tag_value_counts.values\n",
    "    user_tag_array[index] = line_data\n",
    "    # Lightweight progress indicator, printed every 100 users.\n",
    "    if index % 100 == 0:\n",
    "        print(index, end='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_tag_array"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 建立电影标签表，行是电影的索引，列是标签的索引，值是电影被打过这个标签的次数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "movie_id_to_index_dict = {}\n",
    "movie_index_to_id_dict = {}\n",
    "\n",
    "# Zero-initialised (movie, tag) count matrix. int32 is used because a count\n",
    "# above 127 does not fit into the int8 range.\n",
    "movie_tag_array = np.zeros(shape=(movie_quantity, tag_quantity), dtype=np.int32)\n",
    "\n",
    "# groupby yields one sub-frame per movie_id; the enumeration position becomes\n",
    "# that movie's row index in movie_tag_array.\n",
    "for index, (movie_id, groupby_movieid) in enumerate(df_tag.groupby('movie_id')):\n",
    "    movie_id_to_index_dict[movie_id] = index\n",
    "    movie_index_to_id_dict[index] = movie_id\n",
    "\n",
    "    # Count the tags of the current movie group (groupby_movieid). The row\n",
    "    # has to be filled inside the loop, otherwise only the last movie's row\n",
    "    # would be written.\n",
    "    # NOTE: this rebinds the notebook-level name tag_value_counts.\n",
    "    tag_value_counts = groupby_movieid['tag_index'].value_counts()\n",
    "    line_data = np.zeros(shape=tag_quantity, dtype=np.int32)\n",
    "    # value_counts() is indexed by tag_index, which addresses the column.\n",
    "    line_data[tag_value_counts.index.values] = tag_value_counts.values\n",
    "    movie_tag_array[index] = line_data\n",
    "    # Lightweight progress indicator, printed every 100 movies.\n",
    "    if index % 100 == 0:\n",
    "        print(index, end='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movie_tag_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movie_tag_array"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将df_tag中的user_id变更为user_index，movie_id变更为movie_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "0",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-37-70ca14003998>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0muser_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0muser_id_to_index_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'movie_id'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'movie_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mmovie_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mmovie_id_to_index_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmovie_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, convert_dtype, args, **kwds)\u001b[0m\n\u001b[1;32m   3847\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3848\u001b[0m                 \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobject\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3849\u001b[0;31m                 \u001b[0mmapped\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_infer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconvert\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconvert_dtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3850\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3851\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmapped\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmapped\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/lib.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.map_infer\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32m<ipython-input-37-70ca14003998>\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(user_id)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0muser_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0muser_id_to_index_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'movie_id'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'movie_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mmovie_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mmovie_id_to_index_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmovie_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mdf_tag\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 0"
     ]
    }
   ],
   "source": [
    "df_tag['user_id'] = df_tag['user_id'].apply(lambda user_id:user_id_to_index_dict[user_id])\n",
    "df_tag['movie_id'] = df_tag['movie_id'].apply(lambda movie_id:movie_id_to_index_dict[movie_id])\n",
    "df_tag.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_index</th>\n",
       "      <th>movie_index</th>\n",
       "      <th>tag_index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>16933</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>16933</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>16933</td>\n",
       "      <td>233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>16933</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>16933</td>\n",
       "      <td>721</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_index  movie_index  tag_index\n",
       "1           0        16933        428\n",
       "2           0        16933        428\n",
       "3           0        16933        233\n",
       "4           0        16933        428\n",
       "5           0        16933        721"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tag.columns = ['user_index','movie_index','tag_index']\n",
    "df_tag.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 用户和电影的矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "01002003004005006007008009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700"
     ]
    }
   ],
   "source": [
    "# i1指int8\n",
    "user_movie_array = np.zeros(shape = (user_quantity,movie_quantity),dtype='i1')\n",
    "\n",
    "for user_index,groupby_userindex in df_tag.groupby('user_index'):\n",
    "    movie_indexs = groupby_userindex['movie_index'].unique().tolist()\n",
    "    line_data = np.zeros(shape = movie_quantity,dtype='i1')\n",
    "    for movie_index in movie_indexs:\n",
    "        line_data[movie_index] = 1\n",
    "    user_movie_array[user_index] = line_data\n",
    "    if user_index % 100 == 0:print(user_index,end='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 0, 1]], dtype=int8)"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_movie_array"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对热门标签做惩罚"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lv/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in true_divide\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[nan, nan, nan, ..., nan, nan, nan],\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       ...,\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       [nan, nan, nan, ..., nan, nan, nan]])"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_tag_array2 = np.around(user_tag_array/np.log(1+(user_tag_array > 0).astype(int).sum(axis=0)),3)\n",
    "user_tag_array2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对热门商品做惩罚"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lv/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/ipykernel_launcher.py:2: RuntimeWarning: invalid value encountered in true_divide\n",
      "  \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[nan, nan, nan, ..., nan, nan, nan],\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       ...,\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       [nan, nan, nan, ..., nan, nan, nan],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.array([user_movie_array.sum(axis=0)]).T\n",
    "movie_tag_array2 = np.around(movie_tag_array/np.log(1+np.array([user_movie_array.sum(axis=0)]).T),3)\n",
    "movie_tag_array2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 构建用户对商品的兴趣矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lv/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in greater\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700
70170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127
51276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177
51776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227
522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693
07030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693
57035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694
07040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694
57045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415
54255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416
04260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416
54265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676667766786679668066816682668366846685668666876688668966906691669266936694669566966697669866996700670167026703670467056706670767086709671067116712671367146715671667176718671967206721672267236724672567266727672867296730"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lv/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/ipykernel_launcher.py:11: RuntimeWarning: invalid value encountered in greater\n",
      "  # This is added back by InteractiveShellApp.init_path()\n"
     ]
    }
   ],
   "source": [
    "#where返回两个索引，一个行的索引，一个列的索引\n",
    "user_movie_fav_array = np.zeros(shape = (user_quantity,movie_quantity))\n",
    "\n",
    "for user_index in range(user_quantity):\n",
    "    # 拿到用户打过的标签索引向量\n",
    "    user_rated_tag_indexs =np.where(user_tag_array2[user_index]>0)[0].tolist()\n",
    "    # 用户对标签的喜好程度\n",
    "    user_rated_tag_values =user_tag_array2[user_index][user_rated_tag_indexs]\n",
    "    #被打过这些标签的电影索引\n",
    "    #a= movie_tag_array2[:,user_rated_tag_indexs]\n",
    "    taged_movie_indexs = np.where((movie_tag_array2[:,user_rated_tag_indexs]>0).astype(int).sum(axis=1)>0)[0].tolist()\n",
    "    sub_movie_tag_array = movie_tag_array2[taged_movie_indexs][:,user_rated_tag_indexs]\n",
    "    movies_fav =np.around(np.dot(sub_movie_tag_array,np.array([user_rated_tag_values]).T),3).T[0].tolist()\n",
    "    line_data = np.zeros(shape = movie_quantity)\n",
    "    for i,movie_index in enumerate(taged_movie_indexs):\n",
    "        line_data[movie_index] = movies_fav[i]\n",
    "    user_movie_fav_array[user_index] = line_data\n",
    "    print(user_index,end='')\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 生成用户推荐"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_recommand = {}\n",
    "\n",
    "for user_index in range(user_quantity):\n",
    "    user_recommand[user_index] =np.where(user_movie_fav_array[user_index]>1)[0].tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取打分表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rating = pd.read_csv('./data/rating.csv',usecols = [0,1,2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rating.columns = ['user_id','movie_id','rating']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "def deal_with_userid(user_id):\n",
    "    if user_id in user_id_to_index_dict.keys():\n",
    "        return user_id_to_index_dict[user_id]\n",
    "    else:\n",
    "        return None\n",
    "    \n",
    "    def deal_with_movieid(movie_id):\n",
    "        if movie_id in movie_id_to_index_dict.keys():\n",
    "            return movie_id_to_index_dict[user_id]\n",
    "        else:\n",
    "            return None\n",
    "        \n",
    "    df_rating['user_id'] = df_rating['user_id'].apply(deal_with_userid)\n",
    "    df_rating['movie_id'] = df_rating['movie_id'].apply(deal_with_movied)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>29</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>32</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>47</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>50</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  movie_id  rating\n",
       "0        1         2     3.5\n",
       "1        1        29     3.5\n",
       "2        1        32     3.5\n",
       "3        1        47     3.5\n",
       "4        1        50     3.5"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_rating.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 20000263 entries, 0 to 20000262\n",
      "Data columns (total 3 columns):\n",
      " #   Column    Dtype  \n",
      "---  ------    -----  \n",
      " 0   user_id   int64  \n",
      " 1   movie_id  int64  \n",
      " 2   rating    float64\n",
      "dtypes: float64(1), int64(2)\n",
      "memory usage: 457.8 MB\n"
     ]
    }
   ],
   "source": [
    "df_rating.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rating = df_rating.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 20000263 entries, 0 to 20000262\n",
      "Data columns (total 3 columns):\n",
      " #   Column    Dtype  \n",
      "---  ------    -----  \n",
      " 0   user_id   int64  \n",
      " 1   movie_id  int64  \n",
      " 2   rating    float64\n",
      "dtypes: float64(1), int64(2)\n",
      "memory usage: 610.4 MB\n"
     ]
    }
   ],
   "source": [
    "df_rating.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>29</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>32</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>47</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>50</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  movie_id  rating\n",
       "0        1         2     3.5\n",
       "1        1        29     3.5\n",
       "2        1        32     3.5\n",
       "3        1        47     3.5\n",
       "4        1        50     3.5"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_rating['user_id'] = df_rating['user_id'].astype(int)\n",
    "df_rating['movie_id'] = df_rating['movie_id'].astype(int)\n",
    "df_rating.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rating_columns = ['user_index','movie_index','rating']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 生成用户喜欢的电影"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'user_index'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-85-8fa1cd599890>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0muser_fav\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0muser_index\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mgroupby_userindex\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_rating\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'user_index'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m     \u001b[0mmovies_raing\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgroupby_userindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'movie_index'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'rating'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m     fav_movie_indexs = movies_rating[\n\u001b[1;32m      5\u001b[0m         \u001b[0mmovies_rating\u001b[0m \u001b[0;34m>=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mgroupby\u001b[0;34m(self, by, axis, level, as_index, sort, group_keys, squeeze, observed)\u001b[0m\n\u001b[1;32m   5805\u001b[0m             \u001b[0mgroup_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgroup_keys\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5806\u001b[0m             \u001b[0msqueeze\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5807\u001b[0;31m             \u001b[0mobserved\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mobserved\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   5808\u001b[0m         )\n\u001b[1;32m   5809\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/pandas/core/groupby/groupby.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated)\u001b[0m\n\u001b[1;32m    407\u001b[0m                 \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msort\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    408\u001b[0m                 \u001b[0mobserved\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mobserved\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 409\u001b[0;31m                 \u001b[0mmutated\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmutated\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    410\u001b[0m             )\n\u001b[1;32m    411\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Downloads/Python/Practice/jupyter notebook practice/venv/lib/python3.7/site-packages/pandas/core/groupby/grouper.py\u001b[0m in \u001b[0;36mget_grouper\u001b[0;34m(obj, key, axis, level, sort, observed, mutated, validate)\u001b[0m\n\u001b[1;32m    596\u001b[0m                 \u001b[0min_axis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    597\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 598\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    599\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGrouper\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mgpr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkey\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    600\u001b[0m             \u001b[0;31m# Add key to exclusions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 'user_index'"
     ]
    }
   ],
   "source": [
    "user_fav ={}\n",
    "for user_index,groupby_userindex in df_rating.groupby('user_index'):\n",
    "    movies_raing = groupby_userindex.groupby('movie_index')['rating'].mean()\n",
    "    fav_movie_indexs = movies_rating[\n",
    "        movies_rating >=3\n",
    "    ].index.tolist()\n",
    "    user_fav[user_index] = fav_movie_indexs\n",
    "    print(a)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 计算准确率和召回率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "union_quantity = 0\n",
    "recommand_quantity = 0\n",
    "fav_quantity = 0\n",
    "\n",
    "for user_index in user_recommend.keys():\n",
    "    if user_index in user_fav.keys():\n",
    "        union_quantity +=len(\n",
    "            set(user_recommend[user_index]) & set(user_fav[user_index])\n",
    "        )\n",
    "        recommend_quantity += len(user_recommend[user_index])\n",
    "        fav_quantity += len(user_fav[user_index])\n",
    "        \n",
    "print('precision',union_quantity / recommend_quantity)\n",
    "print('recall',union_quantity / fav_quantity)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
